# read in the data and rename the column name to align with cloud data in the data dictionaries
image1<- as.data.frame(read.table("image_data/image1.txt"))
image1$picture<- rep("image1", nrow(image1))
colnames(image1) [1:11] <- c("y_coordinate", "x_coordinate", "label",
"NDAI", "SD", "CORR", "DF_angle", "CF_angle" ,
"BF_angle", "AF_angle", "AN_angle")
image2<- as.data.frame(read.table("image_data/image2.txt"))
image2$picture<- rep("image2", nrow(image2))
colnames(image2) [1:11] <- c("y_coordinate", "x_coordinate", "label",
"NDAI", "SD", "CORR", "DF_angle",
"CF_angle" , "BF_angle", "AF_angle", "AN_angle")
image3<- as.data.frame(read.table("image_data/image3.txt"))
image3$picture<- rep("image3", nrow(image3))
colnames(image3) [1:11] <- c("y_coordinate", "x_coordinate", "label",
"NDAI", "SD", "CORR", "DF_angle",
"CF_angle" , "BF_angle", "AF_angle", "AN_angle")
total_df <- rbind(image1, image2, image3)
ggplot(image1)+ geom_point(alpha = 0.5, aes(x= x_coordinate, y = y_coordinate,
color = factor(label)))+
xlab(" x coordinate") +
ylab(" y coordinate") +
ggtitle("Coordinate Map image1 with label ")+
theme_bw()
ggplot(image2)+ geom_point(alpha = 0.5, aes(x= x_coordinate, y = y_coordinate,
color = factor(label)))+
xlab(" x coordinate") +
ylab(" y coordinate") +
ggtitle("Coordinate Map image2 with label ")+
theme_bw()
ggplot(image3)+ geom_point(alpha = 0.5, aes(x= x_coordinate, y = y_coordinate,
color = factor(label)))+
xlab(" x coordinate") +
ylab(" y coordinate") +
ggtitle("Coordinate Map image3 with label ")+
theme_bw()
#Check if there's any missing data
sum(is.na(total_df))
## [1] 0
#There is no missing value
summary(total_df)
## y_coordinate x_coordinate label NDAI
## Min. : 2.0 Min. : 65.0 Min. :-1.0000 Min. :-1.8420
## 1st Qu.: 98.0 1st Qu.:143.0 1st Qu.:-1.0000 1st Qu.:-0.4286
## Median :193.0 Median :218.0 Median : 0.0000 Median : 1.3476
## Mean :193.1 Mean :218.1 Mean :-0.1334 Mean : 1.0847
## 3rd Qu.:289.0 3rd Qu.:294.0 3rd Qu.: 0.0000 3rd Qu.: 2.3142
## Max. :383.0 Max. :369.0 Max. : 1.0000 Max. : 4.5639
## SD CORR DF_angle CF_angle
## Min. : 0.1987 Min. :-0.3872 Min. : 45.28 Min. : 31.19
## 1st Qu.: 1.6376 1st Qu.: 0.1253 1st Qu.:244.56 1st Qu.:219.27
## Median : 4.3095 Median : 0.1603 Median :281.91 Median :259.31
## Mean : 8.0633 Mean : 0.1860 Mean :271.36 Mean :246.37
## 3rd Qu.: 10.2264 3rd Qu.: 0.2231 3rd Qu.:300.39 3rd Qu.:279.59
## Max. :117.5810 Max. : 0.8144 Max. :410.53 Max. :360.68
## BF_angle AF_angle AN_angle picture
## Min. : 24.49 Min. : 21.07 Min. : 20.57 Length:345556
## 1st Qu.:200.79 1st Qu.:185.16 1st Qu.:174.88 Class :character
## Median :236.17 Median :211.54 Median :197.58 Mode :character
## Mean :224.20 Mean :201.71 Mean :188.29
## 3rd Qu.:258.62 3rd Qu.:235.15 3rd Qu.:216.80
## Max. :335.08 Max. :318.70 Max. :306.93
#Calculate the proportion
percentage_label<- total_df %>%
group_by(picture, label) %>%
summarise (n = n()) %>%
mutate(freq = n / sum(n))
percentage_label
g1 <- ggplot(data=percentage_label[percentage_label$picture=='image1',],
aes(x=label, y=freq)) +
geom_bar(stat="identity", color="black", fill="lightskyblue3")+
ylab("Proportion")+
ylim(0,0.6)
g2 <- ggplot(data=percentage_label[percentage_label$picture=='image2',],
aes(x=label, y=freq)) +
geom_bar(stat="identity", color="black", fill="lightskyblue3")+
ylab(NULL)+
ylim(0,0.6)
g3 <- ggplot(data=percentage_label[percentage_label$picture=='image3',],
aes(x=label, y=freq)) +
geom_bar(stat="identity", color="black", fill="lightskyblue3")+
ylab(NULL)+
ylim(0,0.6)
grid.arrange(g1,g2,g3,ncol=3,top = textGrob("Proportion of label in each image",
gp = gpar(fontsize = 16)))
labels<- factor(total_df$label)
cor(total_df[,c(-12)])
## y_coordinate x_coordinate label NDAI SD
## y_coordinate 1.000000000 -0.006376002 -0.213372321 -0.3276781 -0.2646997
## x_coordinate -0.006376002 1.000000000 -0.465822566 -0.5231960 -0.3233889
## label -0.213372321 -0.465822566 1.000000000 0.6169346 0.2954477
## NDAI -0.327678096 -0.523195958 0.616934624 1.0000000 0.6310626
## SD -0.264699672 -0.323388894 0.295447745 0.6310626 1.0000000
## CORR -0.254309102 -0.361793127 0.444059231 0.4034998 0.2968385
## DF_angle 0.378983970 0.081274648 0.006550085 -0.1610916 -0.2061691
## CF_angle 0.472777594 0.269869879 -0.208279170 -0.3622113 -0.3688601
## BF_angle 0.520812966 0.339539816 -0.337948500 -0.4629301 -0.4404404
## AF_angle 0.530441793 0.368160603 -0.389741017 -0.4927484 -0.4555423
## AN_angle 0.515677887 0.383211616 -0.389358825 -0.4895267 -0.4466229
## CORR DF_angle CF_angle BF_angle AF_angle
## y_coordinate -0.2543091 0.378983970 0.4727776 0.5208130 0.5304418
## x_coordinate -0.3617931 0.081274648 0.2698699 0.3395398 0.3681606
## label 0.4440592 0.006550085 -0.2082792 -0.3379485 -0.3897410
## NDAI 0.4034998 -0.161091626 -0.3622113 -0.4629301 -0.4927484
## SD 0.2968385 -0.206169130 -0.3688601 -0.4404404 -0.4555423
## CORR 1.0000000 0.126283481 -0.1660966 -0.4311043 -0.6039353
## DF_angle 0.1262835 1.000000000 0.8495716 0.6991073 0.5910958
## CF_angle -0.1660966 0.849571562 1.0000000 0.9119430 0.8216176
## BF_angle -0.4311043 0.699107255 0.9119430 1.0000000 0.9529627
## AF_angle -0.6039353 0.591095794 0.8216176 0.9529627 1.0000000
## AN_angle -0.6820440 0.547609821 0.7727499 0.9043409 0.9706198
## AN_angle
## y_coordinate 0.5156779
## x_coordinate 0.3832116
## label -0.3893588
## NDAI -0.4895267
## SD -0.4466229
## CORR -0.6820440
## DF_angle 0.5476098
## CF_angle 0.7727499
## BF_angle 0.9043409
## AF_angle 0.9706198
## AN_angle 1.0000000
corrplot.mixed(cor(total_df[,c(-3, -12)]))
G<- ggplot(total_df)+geom_density(aes(x= NDAI, group= label,
color=labels, fill= labels),
color= "black",alpha = 0.7)+
ggtitle("Overlaying histogram of NDAI based on labels")+
theme_minimal()
H<- ggplot(total_df)+geom_density(aes(x= log(SD), group= label,
color=labels, fill= labels),
color= "black",alpha = 0.7)+
ggtitle("Overlaying histogram of Log SD based on labels")+
theme_minimal()
I<- ggplot(total_df)+geom_density(aes(x= CORR, group= label,
color=labels, fill= labels),
color= "black",alpha = 0.7)+
ggtitle("Overlaying histogram of CORR based on labels")+
theme_minimal()
A<- ggplot(total_df)+geom_density(aes(x= DF_angle, group= label,
color=labels, fill= labels),
color= "black",alpha = 0.7)+
ggtitle("Overlaying histogram of DF_angle based on labels")+
theme_minimal()
B<- ggplot(total_df)+geom_density(aes(x= CF_angle, group= label,
color=labels, fill= labels),
color= "black",alpha = 0.7)+
ggtitle("Overlaying histogram of CF_angle based on labels")+
theme_minimal()
C<- ggplot(total_df)+geom_density(aes(x= BF_angle, group= label,
color=labels, fill= labels),
color= "black",alpha = 0.7)+
ggtitle("Overlaying histogram of BF_angle based on labels")+
theme_minimal()
D<-ggplot(total_df)+geom_density(aes(x= AF_angle, group= label,
color=labels, fill= labels),
color= "black",alpha = 0.7)+
ggtitle("Overlaying histogram of AF_angle based on labels")+
theme_minimal()
E<- ggplot(total_df)+geom_density(aes(x= AN_angle, group= label,
color=labels, fill= labels),
color= "black",alpha = 0.7)+
ggtitle("Overlaying histogram of AN_angle based on labels")+
theme_minimal()
plot_grid(B,C,D,E,nrow=2, align="h")
plot_grid(A,G,H,I,nrow=2, align="h")
# Filter all the undefined label
image1<- image1 %>% filter(label !=0)
image2<- image2 %>% filter(label !=0)
image3<- image3 %>% filter(label !=0)
traintest_split<- function(data){
res<-list()
trainIndex <- createDataPartition(data$label, p = .8,
list = FALSE,
times = 1)
res$train<- data[ trainIndex,]
res$test<- data[-trainIndex,]
return(res)
}
trainval_split<- function(data){
res<-list()
valIndex <- createDataPartition(data$label, p = .2,
list = FALSE,
times = 1)
res$val<- data[ valIndex,]
res$train<- data[-valIndex,]
return(res)
}
##image 1
#Test-train split
train1_label_split1<- traintest_split(image1)$train
# get test from train-test split
test_label_split1<- traintest_split(image1)$test
#Train-val split ---get val
val_label_split1<- trainval_split(train1_label_split1)$val
#Get train
train_label_split1<-trainval_split(train1_label_split1)$train
##image 2
train2_label_split2<- traintest_split(image2)$train
# get test from train-test split
test_label_split2<- traintest_split(image2)$test
#Train-val split ---get val
val_label_split2<- trainval_split(train2_label_split2)$val
#Get train
train_label_split2<-trainval_split(train2_label_split2)$train
##image 3
train3_label_split3<- traintest_split(image3)$train
# get test from train-test split
test_label_split3<- traintest_split(image3)$test
#Train-val split ---get val
val_label_split3<- trainval_split(train3_label_split3)$val
#Get train
train_label_split3<-trainval_split(train3_label_split3)$train
method1_train<-rbind(train_label_split1,train_label_split2,train_label_split3)
method1_val<-rbind(val_label_split1,val_label_split2,val_label_split3)
method1_test<- rbind(test_label_split1,test_label_split2,test_label_split3)
method1_train_val<-rbind(method1_train,method1_val)
method1_train_val$label<-factor(method1_train_val$label)
#Check Duplicates
image1 %>% distinct()
grid_row <- 10
grid_col <- 10
datasplit<- function(grid_row, grid_col, image){
ret <- list()
train_data <- data.frame()
val_data <- data.frame()
test_data <- data.frame()
min_y_cor <- min(image$y_coordinate)
min_x_cor <- min(image$x_coordinate)
max_y_cor <- max(image$y_coordinate)
max_x_cor <- max(image$x_coordinate)
divide_row <- seq(min_y_cor,max_y_cor+1,(max_y_cor+1-min_y_cor)/(grid_row-1))
divide_col <- seq(min_x_cor,max_x_cor+1,(max_x_cor+1-min_x_cor)/(grid_col-1))
for (i in 1:length(divide_row)){
for (j in 1:length(divide_col)){
chunk <- image[image$y_coordinate>=divide_row[i] & image$y_coordinate<divide_row[i+1] &
image$x_coordinate>=divide_col[j] & image$x_coordinate<divide_col[j+1], ]
test_and_val <- floor(nrow(chunk)*0.4)
test_and_val_ind <- sample(seq_len(nrow(chunk)), size = test_and_val)
val_size <- floor(test_and_val/2)
val_ind_ind <- sample(length(test_and_val_ind), size = val_size)
val_ind <- test_and_val_ind[val_ind_ind]
test_ind <- test_and_val_ind[-val_ind_ind]
test<- chunk[test_ind, ]
val<- chunk[val_ind, ]
train<- chunk[-test_and_val_ind, ]
train_data = rbind(train_data,train)
test_data = rbind(test_data,test)
val_data = rbind(val_data,val)
}
}
ret$training <- train_data
ret$validation <- val_data
ret$test <- test_data
return(ret)
}
split_result_1 <- datasplit(10,10,image1)
split_result_2 <- datasplit(10,10,image2)
split_result_3 <- datasplit(10,10,image3)
train_total<- rbind(split_result_1$training,split_result_2$training,split_result_3$training)
val_total<- rbind(split_result_1$validation,split_result_2$validation,split_result_3$validation)
test_total<- rbind(split_result_1$test,split_result_2$test,split_result_3$test)
train_total$picture<- NULL
val_total$picture<- NULL
test_total$picture<-NULL
train_total$label<-factor(train_total$label)
test_trivial<- test_total
test_trivial$label<--1
train_trivial<- train_total
train_trivial$label<--1
val_trivial<- val_total
val_trivial$label<--1
sum(test_trivial$label ==test_total$label)/length(test_total$label)
## [1] 0.6095732
sum(val_trivial$label ==val_total$label)/length(val_trivial$label)
## [1] 0.6107271
sum(train_trivial$label ==train_total$label)/length(train_trivial$label)
## [1] 0.6112134
train_val<- rbind(train_total, val_total)
train_val$label<- as.factor(train_val$label)
test_total$label<- as.factor(test_total$label)
#visualization by using boxplot
A<-ggplot(train_val)+geom_boxplot(aes(x=factor(label), y= y_coordinate,
color= factor(label)))+ theme_bw()+
ggtitle("y_coordinate and label")+xlab("label")+
theme(plot.title = element_text(size = rel(1),
vjust = 1.5,face="bold.italic"))+
theme(legend.position="none")
B<-ggplot(train_val)+geom_boxplot(aes(x=factor(label), y= x_coordinate,
color= factor(label)))+ theme_bw()+
ggtitle("x_coordinate and label")+theme(legend.position="none")+
xlab("label")+theme(plot.title = element_text(size = rel(1),
vjust = 1.5,face="bold.italic"))
C<-ggplot(train_val)+geom_boxplot(aes(x=factor(label), y= NDAI,
color= factor(label)))+ theme_bw()+
ggtitle("label and NDAI")+theme(legend.position="none")+
xlab("label")+theme(plot.title = element_text(size = rel(1),
vjust = 1.5,face="bold.italic"))
D<-ggplot(train_val)+geom_boxplot(aes(x=factor(label), y= SD,color= factor(label)))+ theme_bw()+
ggtitle("label and SD")+theme(legend.position="none")+
xlab("label")+theme(plot.title = element_text(size = rel(1),
vjust = 1.5,face="bold.italic"))
E<-ggplot(train_val)+geom_boxplot(aes(x=factor(label), y= CORR,color= factor(label)))+ theme_bw()+
ggtitle("label and CORR")+theme(legend.position="none")+
xlab("label")+theme(plot.title = element_text(size = rel(1),
vjust = 1.5,face="bold.italic"))
G<-ggplot(train_val)+geom_boxplot(aes(x=factor(label), y= DF_angle,
color= factor(label)))+ theme_bw()+
theme(legend.position="none")+
ggtitle("label and DF_angle")+
xlab("label")+theme(plot.title = element_text(size = rel(1),
vjust = 1.5,face="bold.italic"))
H<-ggplot(train_val)+geom_boxplot(aes(x=factor(label), y= CF_angle,
color= factor(label)))+ theme_bw()+
theme(legend.position="none")+
ggtitle("label and CF_angle")+
xlab("label")+theme(plot.title = element_text(size = rel(1),
vjust = 1.5,face="bold.italic"))
I<-ggplot(train_val)+geom_boxplot(aes(x=factor(label), y= BF_angle,
color= factor(label)))+ theme_bw()+
theme(legend.position="none")+
ggtitle("label and BF_angle")+
xlab("label")+theme(plot.title = element_text(size = rel(1),
vjust = 1.5,face="bold.italic"))
J<-ggplot(train_val)+geom_boxplot(aes(x=factor(label), y= AF_angle,
color= factor(label)))+ theme_bw()+
theme(legend.position="none")+
ggtitle("label and AF_angle")+
xlab("label")+theme(plot.title = element_text(size = rel(1),
vjust = 1.5,face="bold.italic"))
K<-I<-ggplot(train_val)+geom_boxplot(aes(x=factor(label), y= AN_angle,
color= factor(label)))+ theme_bw()+
ggtitle("label and AN_angle")+theme(legend.position="none")+
xlab("label")+theme(plot.title = element_text(size = rel(1),
vjust = 1.5,face="bold.italic"))
plot_grid(A, B,C,D,E,G,H,I,J,K,nrow=3, align="h")
summary(lm(as.numeric(label)~NDAI, data = train_val))$r.squared
## [1] 0.5744323
summary(lm(as.numeric(label)~CORR, data = train_val))$r.squared
## [1] 0.3035037
summary(lm(as.numeric(label)~AF_angle, data = train_val))$r.squared
## [1] 0.257956
summary(lm(as.numeric(label)~x_coordinate, data = train_val))$r.squared
## [1] 0.3264572
summary(lm(as.numeric(label)~y_coordinate, data = train_val))$r.squared
## [1] 0.07865887
summary(lm(as.numeric(label)~SD, data = train_val))$r.squared
## [1] 0.1908709
summary(lm(as.numeric(label)~DF_angle, data = train_val))$r.squared
## [1] 0.0001069918
summary(lm(as.numeric(label)~CF_angle, data = train_val))$r.squared
## [1] 0.08052235
summary(lm(as.numeric(label)~AN_angle, data = train_val))$r.squared
## [1] 0.2553198
# Compute CV loss
seed = 123
K = 5
source("CVgeneric.R")
cv_result <- CVgeneric("logistic", c("y_coordinate", "x_coordinate",
"NDAI", "SD", "CORR", "DF_angle",
"CF_angle", "BF_angle", "AF_angle",
'AN_angle'), "label",
K, data=train_val, loss= accuracy, seed)
set.seed(cv_result$seed)
cv_result_method_2 <- CVgeneric("logistic", c("y_coordinate", "x_coordinate",
"NDAI", "SD", "CORR", "DF_angle",
"CF_angle", "BF_angle", "AF_angle",
'AN_angle'), "label",
K, data=method1_train_val, loss= accuracy, seed)
# Use data based on the best folds
train_data = train_val[-cv_result$index, ]
# Train logistic model
logistic_model<- train(label ~ ., data=train_data, method="glm", family="binomial")
# Test accuracy
predicted.classes <- logistic_model %>% predict(test_total[,-3])
mean(predicted.classes == test_total$label)
## [1] 0.8966023
# Other error metrics
# F1 Score
F1_Score(as.numeric(predicted.classes), as.numeric(test_total$label))
## [1] 0.9148915
# Sensitivity
Sensitivity(as.numeric(predicted.classes), as.numeric(test_total$label))
## [1] 0.9181057
# Find the best cut off point
predictions<- data.frame(predict(logistic_model, newdata =test_total[,-3],
type= "prob" ))
colnames(predictions) <- c(-1,1)
pred <- prediction(predictions$`1`, test_total$label)
perf <- performance(pred, "acc")
plot(perf, avg= "vertical", spread.estimate="boxplot",
show.spread.at= seq(0.1, 0.9, by=0.1),
main="Logistic regression cutoff and performance tradeoff")
index<-which.max(slot(perf, "y.values")[[1]])
max<-slot(perf, "x.values")[[1]][index]
perf <- performance(pred, "tpr", "fpr")
# ROC curve plot
plot(perf, colorize=T, print.cutoffs.at=c(max),
text.adj=c(1.5,0.2),
avg="threshold", lwd=3,
main= "Roc curve for logistic regression")
abline(a=0,b=1)
#### Model 2: LDA
# Compute CV loss
cv_result_lda<-CVgeneric("lda", c("y_coordinate", "x_coordinate",
"NDAI", "SD", "CORR", "DF_angle",
"CF_angle", "BF_angle", "AF_angle", 'AN_angle'),
"label",
K = 5, train_val, loss = precision, seed)
cv_result_lda_method_2<-CVgeneric("lda", c("y_coordinate", "x_coordinate",
"NDAI", "SD", "CORR", "DF_angle",
"CF_angle", "BF_angle", "AF_angle", 'AN_angle'),
"label",
K = 5, method1_train_val, loss = precision, seed)
# Use data based on the best folds
train_data = train_val[-cv_result_lda$index, ]
# Fit LDA Model
lda.model = lda(factor(label)~., data=train_val)
lda_pred<-predict(lda.model, newdata=test_total[,-3])
lda_pre2<-predict(lda.model, newdata=test_total[,-3], type= "prob")
#Other error metrics
#F1 Score
F1_Score(as.numeric(lda_pred$class), as.numeric(test_total$label))
## [1] 0.9165936
#Sensitivity
Sensitivity(as.numeric(lda_pred$class), as.numeric(test_total$label))
## [1] 0.9271657
#Accuracy of the model
mean(lda_pred$class == test_total$label)
## [1] 0.8994617
# Find the best cut off point
predictions<- data.frame(predict(lda.model, newdata =test_total[,-3] ))
colnames(predictions) <- c("label",-1,1)
pred <- prediction(predictions$`1`, test_total$label)
perf <- performance(pred, "acc")
plot(perf, avg= "vertical", spread.estimate="boxplot", show.spread.at= seq(0.1, 0.9, by=0.1), main="LDA cutoff and performance tradeoff")
index<-which.max(slot(perf, "y.values")[[1]])
max<-slot(perf, "x.values")[[1]][index]
# ROC Curve
perf <- performance(pred, "tpr", "fpr")
plot(perf, colorize=T, print.cutoffs.at=c(max), text.adj=c(1,0), avg="threshold", lwd=3, main= "LDA curve for ROC")
abline(a=0,b=1)
# Delete unused column
method1_train_val$picture<-NULL
# Compute CV loss for first method split
seed = 123
K = 5
cv_result_decision_tree <- CVgeneric("decision tree",
c("y_coordinate", "x_coordinate", "NDAI","SD","CORR","DF_angle","CF_angle","BF_angle","AF_angle",'AN_angle'),"label",K,data=train_val,loss= accuracy, seed)
# Use data based on the best folds for the first method split
train_data = train_val[-cv_result_decision_tree$index, ]
# Train the decision tree model
tree = rpart(label ~ ., data=train_data,maxdepth =10, minsplit= 10)
# Visualize the decision tree
fancyRpartPlot(tree)
# Train the decision tree model without the geographic features
tree_no_xy = rpart(label ~ NDAI+SD+CORR+DF_angle+CF_angle+BF_angle+AF_angle+AN_angle , data=train_data,maxdepth =10, minsplit= 10)
# Visualize the decision tree model without the geographic features
fancyRpartPlot(tree_no_xy)
# Accuracy for decision tree model without the geographic features
tree.pred_no_xy = predict(tree_no_xy, newdata=test_total[,-3],type = 'class')
mean(tree.pred_no_xy==test_total$label)
## [1] 0.9050846
# Accuracy for decision tree model WITH the geographic features
tree.pred = predict(tree, newdata=test_total[,-3],method = 'response')
tree.pred2 = predict(tree, newdata=test_total[,-3],type = 'class')
mean(tree.pred2==test_total$label)
## [1] 0.9222174
#Other error metrics
#F1 Score
F1_Score(as.numeric(tree.pred2), as.numeric(test_total$label))
## [1] 0.9336694
#Sensitivity
Sensitivity(as.numeric(tree.pred2), as.numeric(test_total$label))
## [1] 0.9722187
# Find the best cut off point
predictions<- data.frame(predict(tree, newdata =test_total[,-3], type= "prob" ))
colnames(predictions) <- c(-1,1)
pred <- prediction(predictions$`1`, test_total$label)
perf <- performance(pred, "acc")
plot(perf, avg= "vertical", spread.estimate="boxplot",
show.spread.at= seq(0.1, 0.9, by=0.1),
main="Decision Tree cutoff and performance tradeoff")
index<-which.max(slot(perf, "y.values")[[1]])
max<-slot(perf, "x.values")[[1]][index]
perf <- performance(pred, "tpr", "fpr")
# ROC Curve
plot(perf, colorize=T, print.cutoffs.at=max, text.adj=c(1,0), avg="threshold",
lwd=3,
main= "Roc curve for Decision Tree")
abline(a=0,b=1)
# Decision Tree feature importance (4a preparation)
tree_importance<-as.data.frame(varImp(tree))
tree_importance$importance<-rownames(tree_importance)
tree_importance <- tree_importance %>% arrange(desc(Overall))
# Compute CV loss for second method data split
cv_result_decision_tree_method2 <- CVgeneric("decision tree",
c("y_coordinate", "x_coordinate",
"NDAI", "SD", "CORR", "DF_angle",
"CF_angle", "BF_angle", "AF_angle",
'AN_angle'), "label",K,
data=method1_train_val,
loss= accuracy, seed)
train_data_tree_2 = method1_train_val[-cv_result_decision_tree_method2$index, ]
tree2 = rpart(label ~ ., data=train_data_tree_2,maxdepth =10, minsplit= 10)
tree.pred_method_2 <- predict(tree2, newdata=test_total[,-3],type = 'class')
seed = 123
K = 5
# Compute CV loss for first method split
cv_result_rf <- CVgeneric("random forest", c("y_coordinate",
"x_coordinate",
"NDAI", "SD", "CORR",
"DF_angle", "CF_angle",
"BF_angle", "AF_angle",
'AN_angle'), "label", K,
data=train_val, loss= accuracy, seed)
# Use the best folds on the training set
train_data_rf = train_val[-cv_result_rf$index, ]
# Choose the best hyperparameter of ntry which is the Number of variables randomly sampled as candidates at each split.
Random_forest_hyperparameter_tune<-tuneRF(x =train_data[,-3],
y = as.factor(train_data[,3]),
ntreeTry = 50)
## mtry = 3 OOB error = 0.46%
## Searching left ...
## mtry = 2 OOB error = 0.56%
## -0.2166124 0.05
## Searching right ...
## mtry = 6 OOB error = 0.36%
## 0.2084691 0.05
## mtry = 10 OOB error = 0.45%
## -0.2222222 0.05
# Plot the ntry
plot(data.frame(Random_forest_hyperparameter_tune)$mtry,
data.frame(Random_forest_hyperparameter_tune)$OOBError,
type = "l",ylab="OOBError",xlab="mtry",
main = "Random Forest mtry tune")
#Hyperparameter in R for random Forest
#ntree: Number of trees to grow.
# Fit the random forest model
rf_model <- randomForest(as.factor(label) ~ y_coordinate + x_coordinate +
NDAI + SD + CORR + DF_angle + CF_angle + BF_angle +
AF_angle + AN_angle, data = train_data_rf,
importance = TRUE,
ntree=50,
ntry=6,
maxnodes= 500,
nodesize = 10)
# Compute the accuracy of the model
predicted.classes_rf <- rf_model %>% predict(test_total[,-3])
mean(predicted.classes_rf == test_total$label)
## [1] 0.9847414
#Other error metrics
#F1 Score
F1_Score(as.numeric(predicted.classes_rf), as.numeric(test_total$label))
## [1] 0.9873865
#Sensitivity
Sensitivity(as.numeric(predicted.classes_rf), as.numeric(test_total$label))
## [1] 0.9951552
# Find the best cut off point
predictions<- data.frame(predict(rf_model, newdata =test_total[,-3], type= "prob" ))
colnames(predictions) <- c(-1,1)
pred <- prediction(predictions$`1`, test_total$label)
perf <- performance(pred, "acc")
plot(perf, avg= "vertical", spread.estimate="boxplot",
show.spread.at= seq(0.1, 0.9, by=0.1),
main="Random Foret cutoff and performance tradeoff")
index<-which.max(slot(perf, "y.values")[[1]])
max<-slot(perf, "x.values")[[1]][index]
perf <- performance(pred, "tpr", "fpr")
# ROC curve
plot(perf, colorize=T, print.cutoffs.at=c(max), text.adj=c(0.5,0),
avg="threshold", lwd=3, main= "Roc curve for Random Forest")
abline(a=0,b=1)
# Feature importance for random forest model
varImp(rf_model)
#CV method 2
cv_result_rf_method2 <- CVgeneric("random forest",
c("y_coordinate", "x_coordinate", "NDAI", "SD",
"CORR", "DF_angle", "CF_angle", "BF_angle",
"AF_angle", 'AN_angle'),
"label", K, data=method1_train_val,
loss= accuracy,
seed)
#Print accuracy result
#cv_result_rf_method2
#train data by using the best training sets
train_data_rf2 = method1_train_val[-cv_result_rf_method2$index, ]
rf_model_method_2 <- randomForest(as.factor(label) ~ y_coordinate +
x_coordinate + NDAI + SD + CORR +
DF_angle + CF_angle + BF_angle +AF_angle +
AN_angle,
data = train_data_rf2,
importance = TRUE,ntree=50,
ntry=6,maxnodes= 500, nodesize = 10)
predicted.classes_rf_method_2 <- rf_model_method_2 %>% predict(test_total[,-3])
# Decision Tree feature importance
ggplot(data=tree_importance, aes(x=reorder(importance, -Overall), y=Overall)) +
geom_bar(stat="identity", color="black", fill="lightgreen")+
theme(axis.text.x = element_text(angle = 90, hjust = 1))+
xlab("features")+
ylab("importance")+
ggtitle("Feature importance in Decision Tree")
# Balance the cost complexity and error
fit <- rpart(label~.,
method="anova", data=train_val)
printcp(fit) # display the results
##
## Regression tree:
## rpart(formula = label ~ ., data = train_val, method = "anova")
##
## Variables actually used in tree construction:
## [1] AN_angle CORR NDAI x_coordinate
##
## Root node error: 39557/166443 = 0.23766
##
## n= 166443
##
## CP nsplit rel error xerror xstd
## 1 0.643321 0 1.00000 1.00003 0.0011172
## 2 0.035724 1 0.35668 0.35749 0.0021904
## 3 0.020565 2 0.32095 0.32146 0.0021557
## 4 0.020420 3 0.30039 0.30118 0.0020043
## 5 0.015439 4 0.27997 0.28069 0.0018738
## 6 0.010000 5 0.26453 0.26563 0.0019048
plotcp(fit)
summary(fit)
## Call:
## rpart(formula = label ~ ., data = train_val, method = "anova")
## n= 166443
##
## CP nsplit rel error xerror xstd
## 1 0.64332087 0 1.0000000 1.0000287 0.001117176
## 2 0.03572423 1 0.3566791 0.3574913 0.002190419
## 3 0.02056532 2 0.3209549 0.3214584 0.002155653
## 4 0.02042008 3 0.3003896 0.3011766 0.002004315
## 5 0.01543850 4 0.2799695 0.2806913 0.001873813
## 6 0.01000000 5 0.2645310 0.2656329 0.001904752
##
## Variable importance
## NDAI SD CORR AN_angle AF_angle
## 25 18 15 15 13
## BF_angle x_coordinate CF_angle DF_angle
## 12 1 1 1
##
## Node number 1: 166443 observations, complexity param=0.6433209
## mean=1.388908, MSE=0.2376585
## left son=2 (90542 obs) right son=3 (75901 obs)
## Primary splits:
## NDAI < 0.7873264 to the left, improve=0.6433209, (0 missing)
## SD < 3.102224 to the left, improve=0.4642605, (0 missing)
## CORR < 0.197072 to the left, improve=0.4462696, (0 missing)
## x_coordinate < 229.5 to the right, improve=0.2655891, (0 missing)
## AN_angle < 176.503 to the right, improve=0.2638133, (0 missing)
## Surrogate splits:
## SD < 3.117993 to the left, agree=0.868, adj=0.710, (0 split)
## CORR < 0.1949221 to the left, agree=0.806, adj=0.575, (0 split)
## AN_angle < 181.8891 to the right, agree=0.782, adj=0.522, (0 split)
## AF_angle < 186.8958 to the right, agree=0.762, adj=0.478, (0 split)
## BF_angle < 244.978 to the right, agree=0.748, adj=0.447, (0 split)
##
## Node number 2: 90542 observations, complexity param=0.02042008
## mean=1.030903, MSE=0.0299478
## left son=4 (89372 obs) right son=5 (1170 obs)
## Primary splits:
## AN_angle < 172.344 to the right, improve=0.2978938, (0 missing)
## AF_angle < 177.9636 to the right, improve=0.1968573, (0 missing)
## NDAI < 0.3928416 to the left, improve=0.1692084, (0 missing)
## CORR < 0.2076106 to the left, improve=0.1624296, (0 missing)
## BF_angle < 190.1277 to the right, improve=0.1449216, (0 missing)
## Surrogate splits:
## AF_angle < 178.9546 to the right, agree=0.996, adj=0.674, (0 split)
## BF_angle < 189.9061 to the right, agree=0.992, adj=0.416, (0 split)
## CF_angle < 197.6788 to the right, agree=0.990, adj=0.211, (0 split)
## DF_angle < 209.3323 to the right, agree=0.989, adj=0.119, (0 split)
## y_coordinate < 21.5 to the right, agree=0.988, adj=0.068, (0 split)
##
## Node number 3: 75901 observations, complexity param=0.03572423
## mean=1.815971, MSE=0.1501625
## left son=6 (6966 obs) right son=7 (68935 obs)
## Primary splits:
## x_coordinate < 295.5 to the right, improve=0.12398610, (0 missing)
## CORR < 0.1898961 to the left, improve=0.11638770, (0 missing)
## y_coordinate < 45.5 to the left, improve=0.07716196, (0 missing)
## DF_angle < 261.395 to the left, improve=0.06285027, (0 missing)
## SD < 2.739462 to the left, improve=0.05186801, (0 missing)
## Surrogate splits:
## AF_angle < 278.0661 to the right, agree=0.909, adj=0.003, (0 split)
## AN_angle < 260.6509 to the right, agree=0.908, adj=0.003, (0 split)
## BF_angle < 300.31 to the right, agree=0.908, adj=0.000, (0 split)
## SD < 0.5655767 to the left, agree=0.908, adj=0.000, (0 split)
## CORR < -0.3280679 to the left, agree=0.908, adj=0.000, (0 split)
##
## Node number 4: 89372 observations
## mean=1.020096, MSE=0.01969194
##
## Node number 5: 1170 observations
## mean=1.85641, MSE=0.1229717
##
## Node number 6: 6966 observations, complexity param=0.0154385
## mean=1.386736, MSE=0.2371712
## left son=12 (4673 obs) right son=13 (2293 obs)
## Primary splits:
## AN_angle < 196.6854 to the left, improve=0.3696399, (0 missing)
## AF_angle < 210.1692 to the left, improve=0.3273585, (0 missing)
## BF_angle < 231.8952 to the left, improve=0.3106881, (0 missing)
## y_coordinate < 69.5 to the left, improve=0.2865935, (0 missing)
## CF_angle < 251.0203 to the left, improve=0.2776899, (0 missing)
## Surrogate splits:
## AF_angle < 210.0945 to the left, agree=0.949, adj=0.844, (0 split)
## BF_angle < 231.8145 to the left, agree=0.913, adj=0.736, (0 split)
## CF_angle < 250.7385 to the left, agree=0.874, adj=0.616, (0 split)
## y_coordinate < 77.5 to the left, agree=0.853, adj=0.553, (0 split)
## DF_angle < 264.0044 to the left, agree=0.846, adj=0.533, (0 split)
##
## Node number 7: 68935 observations, complexity param=0.02056532
## mean=1.859346, MSE=0.1208706
## left son=14 (24527 obs) right son=15 (44408 obs)
## Primary splits:
## CORR < 0.1976391 to the left, improve=0.09763239, (0 missing)
## SD < 2.612428 to the left, improve=0.04331985, (0 missing)
## DF_angle < 293.1815 to the left, improve=0.04170463, (0 missing)
## AF_angle < 227.3604 to the right, improve=0.04085155, (0 missing)
## NDAI < 3.325084 to the right, improve=0.03524672, (0 missing)
## Surrogate splits:
## AF_angle < 224.184 to the right, agree=0.718, adj=0.208, (0 split)
## AN_angle < 206.1113 to the right, agree=0.718, adj=0.206, (0 split)
## SD < 3.537287 to the left, agree=0.705, adj=0.170, (0 split)
## DF_angle < 281.0187 to the left, agree=0.691, adj=0.132, (0 split)
## y_coordinate < 276.5 to the right, agree=0.686, adj=0.118, (0 split)
##
## Node number 12: 4673 observations
## mean=1.179328, MSE=0.1471695
##
## Node number 13: 2293 observations
## mean=1.80942, MSE=0.1542593
##
## Node number 14: 24527 observations
## mean=1.713173, MSE=0.2045572
##
## Node number 15: 44408 observations
## mean=1.940078, MSE=0.05633103
rsq.rpart(fit)
##
## Regression tree:
## rpart(formula = label ~ ., data = train_val, method = "anova")
##
## Variables actually used in tree construction:
## [1] AN_angle CORR NDAI x_coordinate
##
## Root node error: 39557/166443 = 0.23766
##
## n= 166443
##
## CP nsplit rel error xerror xstd
## 1 0.643321 0 1.00000 1.00003 0.0011172
## 2 0.035724 1 0.35668 0.35749 0.0021904
## 3 0.020565 2 0.32095 0.32146 0.0021557
## 4 0.020420 3 0.30039 0.30118 0.0020043
## 5 0.015439 4 0.27997 0.28069 0.0018738
## 6 0.010000 5 0.26453 0.26563 0.0019048
#refer section 4d
# Long running time
# Fit adaboost model
adaboost<- adaboost(label~y_coordinate + x_coordinate + NDAI + SD + CORR + DF_angle + CF_angle+ BF_angle + AF_angle + AN_angle ,data=train_val, nIter=10)
# Adaboost model Accuracy
mean(predict(adaboost, newdata =test_total[,-3] )$class == test_total$label)
## [1] 0.9977413
# Fit adaboost model without geographic information
boost_no_xy<- boosting(label~NDAI + SD + CORR + DF_angle + CF_angle+ BF_angle + AF_angle + AN_angle ,data=train_val, mfinal = 10, boos=TRUE)
#Error based on number of trees
errorevol(boost_no_xy,train_val[,c(-1,-2)])->evol.train
errorevol(boost_no_xy,test_total[,c(-1,-2)])->evol.test
plot.errorevol(evol.test,evol.train)
importanceplot(boost_no_xy)
# Margins
BC.margins<-margins(boost_no_xy,train_val[,c(-1,-2)]) # training set
BC.adaboost.pred <- predict.boosting(boost_no_xy,newdata=test_total[,c(-1,-2)])
BC.predmargins<-margins(BC.adaboost.pred,test_total[,c(-1,-2)]) # test set
plot.margins(BC.predmargins,BC.margins,alpha=0.3 )
# Adaboost model Accuracy without geographic information
mean(predict(boost_no_xy, newdata =test_total[,-3] )$class == test_total$label)
## [1] 0.9340638
# Cut off point selection
predictions<- data.frame(predict(adaboost, newdata =test_total[,-3] )$prob)
colnames(predictions) <- c(-1,1)
pred <- prediction(predictions$`1`, test_total$label)
perf <- performance(pred, "acc")
plot(perf, avg= "vertical", spread.estimate="boxplot",
show.spread.at= seq(0.1, 0.9, by=0.1),
main="Adaboosting cutoff and performance tradeoff")
index<-which.max(slot(perf, "y.values")[[1]])
max<-slot(perf, "x.values")[[1]][index]
# ROC curve
perf <- performance(pred, "tpr", "fpr")
plot(perf, colorize=T, print.cutoffs.at=c(max), text.adj=c(0.3,0),
avg="threshold", lwd=3, main= "Adaboosting curve for ROC")
abline(a=0,b=1)
#Misclassification pattern for random forest
mis_label_random_forest<-train_data_rf[predicted.classes_rf!=test_total$label,]
ggplot(mis_label_random_forest)+
geom_point(alpha = 0.5, aes(x= x_coordinate, y = y_coordinate,
color = factor(label)))+
xlab(" x coordinate") + ylab(" y coordinate") +
ggtitle("Mis-classification in random-forest model split method 1 ")+
theme_bw()
ggplot(mis_label_random_forest)+geom_density(aes(x= CORR, group= label,
color=label, fill= label),
color= "black",alpha = 0.7)+
ggtitle("RF Overlaying histogram of CORR split by block")+
theme_minimal()
ggplot(mis_label_random_forest)+geom_density(aes(x= NDAI, group= label,
color=label, fill= label),
color= "black",alpha = 0.7)+
ggtitle("RF Overlaying histogram of NDAI split by block")+
theme_minimal()
ggplot(mis_label_random_forest)+geom_density(aes(x= SD, group= label,
color=label, fill= label),
color= "black",alpha = 0.7)+
ggtitle("RF Overlaying histogram of SD split by block")+
theme_minimal()
Predicted <-predicted.classes_rf
Actual<- test_total$label
fourfoldplot(table(Predicted, Actual))
#Method 2
mis_label_random_forest<- train_data_rf2[predicted.classes_rf_method_2!=test_total$label,]
ggplot(mis_label_random_forest)+ geom_point(alpha = 0.5,
aes(x= x_coordinate,y = y_coordinate,
color = factor(label)))+
xlab(" x coordinate") + ylab(" y coordinate")+
ggtitle("Mis-classification in random-forest model split method 2 ")+theme_bw()
Predicted <-predicted.classes_rf_method_2
fourfoldplot(table(Predicted, Actual))
#Misclassification pattern for decision tree
mis_label_decision_tree<- train_data[tree.pred2!=test_total$label,]
ggplot(mis_label_decision_tree)+ geom_point(alpha = 0.5,
aes(x= x_coordinate, y = y_coordinate,
color = factor(label)))+
xlab(" x coordinate") + ylab(" y coordinate") +
ggtitle("Mis-classification in decision tree model split method 1 ")+ theme_bw()
Predicted <-tree.pred2
fourfoldplot(table(Predicted, Actual))
ggplot(mis_label_decision_tree)+geom_density(aes(x= CORR, group= label,
color=label, fill= label),
color= "red",alpha = 0.7)+
ggtitle("DT Overlaying histogram of CORR split by block")+
theme_minimal()
ggplot(mis_label_decision_tree)+geom_density(aes(x= NDAI, group= label,
color=label, fill= label),
color= "red",alpha = 0.7)+
ggtitle("DT Overlaying histogram of NDAI split by block")+
theme_minimal()
ggplot(mis_label_decision_tree)+geom_density(aes(x= SD, group= label,
color=label, fill= label),
color= "red",alpha = 0.7)+
ggtitle("DT Overlaying histogram of SD split by block")+
theme_minimal()
#Method 2
mis_label_decision_tree2<- train_data_tree_2[tree.pred_method_2!=test_total$label,]
ggplot(mis_label_decision_tree2)+ geom_point(alpha = 0.5, aes(x= x_coordinate, y = y_coordinate, color = factor(label)))+
xlab(" x coordinate") + ylab(" y coordinate") +
ggtitle("Mis-classification in decision tree model split method 2 ")+ theme_bw()
Predicted <-tree.pred_method_2
fourfoldplot(table(Predicted, Actual))